pacman::p_load(readxl,readr,dplyr,stringi,stringr,tidyr)
rm(list=ls())
################################################################################

############################# Argentina 

#### Agricultural employment
province_list <- c('buenosaires','catamarca','chaco','chubut','cordoba','corrientes','entrerios',
                   'formosa','jujuy','lapampa','larioja','mendoza','misiones','neuquen','rionegro',
                   'salta','sanjuan','sanluis','santacruz','santafe','santiagodelestero',
                   'tierradelfuego','tucuman')
province_name <- c('Buenos Aires','Catamarca','Chaco','Chubut','Cordoba','Corrientes','Entre Rios',
                   'Formosa','Jujuy','La Pampa','La Rioja','Mendoza','Misiones','Neuquen','Rio Negro',
                   'Salta','San Juan','San Luis','Santa Cruz','Santa Fe','Santiago del Estero',
                   'Tierra del Fuego','Tucuman')
df_2018 <- NULL
for (i in seq_along(province_list)) {
  sheet <- province_list[i]
  state <- province_name[i]
  df <- read_excel('../../data/factors/raw/labor_argentina.xlsx', sheet = sheet, .name_repair = "unique_quiet")
  
  df <- df %>% fill(Partido, .direction = "down") %>% filter(row_number() %% 2 == 0)
  df <- df[ , -c(1, 3)]  
  df <- df %>% mutate(year = "2018",state = state) %>%
    relocate(year, .before = 1) %>%
    relocate(state, .before = 2) %>%
    rename(county = Partido,employment_agriculture = Total) %>%
    mutate(across(c(county, state), ~ stri_trans_general(., "Latin-ASCII"))) %>%
    mutate(across(c(county, state), ~ toupper(.))) %>%
    mutate(across(c(county, state), ~ str_trim(.))) %>%
    filter(!is.na(county) & county != "NAN")
  if (is.null(df_2018)) {df_2018 <- df} else {df_2018 <- bind_rows(df_2018, df) }
}

df_2018$county <- str_replace_all(df_2018$county, c(
  "1° DE MAYO" = "PRIMERO DE MAYO",
  "12 DE OCTUBRE" = "DOCE DE OCTUBRE",
  "2 DE ABRIL" = "DOS DE ABRIL",
  "25 DE MAYO" = "VEINTICINCO DE MAYO",
  "9 DE JULIO" = "NUEVE DE JULIO",
  "ADOLFO GONZALES CHAVES" = "GONZALES CHAVES",
  "CORONEL DE MARINA LEONARDO ROSALES"="CORONEL ROSALES",
  "CORONEL DE MARINA L ROSALES" = "CORONEL ROSALES",
  "CONCEPCION DE LA SIERRA" = "CONCEPCION",
  "FLORENTINO AMEGHINO" = "AMEGHINO",
  "GENERAL JUAN MADARIAGA" = "GENERAL MADARIAGA",
  "LDOR GRAL SAN MARTIN" = "LIBERTADOR GENERAL SAN MARTIN",
  "LEANDRO N ALEM" = "LEANDRO N. ALEM",
  "GRAL BELGRANO" = "GENERAL BELGRANO",
  "LA CANDELARIA" = "CANDELARIA",
  "DR MANUEL BELGRANO" = "GENERAL MANUEL BELGRANO",
  "LA CAPITAL" = "CAPITAL"))
df_1 <- df_2018


#### All employment
df <- read_csv("../../data/factors/raw/poblacion_identificada_departamento_agosto_2023.csv",show_col_types = FALSE)
df <- df %>% rename(state_id = provincia_id,state = nombre_provincia,county_id = departamento_id,
                    county = nombre_departamento,gender = sexo,age_group = edad_quinquenal,nationality = nacionalidad,
                    country_of_birth = pais_nacimiento,employment_all = cantidad)
df$year <- 2023
df <- df %>% mutate(across(c(state, county), ~ stri_trans_general(., "Latin-ASCII"))) %>%
  mutate(across(c(state, county), ~ str_to_upper(.))) %>% mutate(across(c(state, county), ~ str_trim(.)))
df$county <- df$county %>%
  recode(
    '1° DE MAYO' = 'PRIMERO DE MAYO',
    '12 DE OCTUBRE' = 'DOCE DE OCTUBRE',
    '2 DE ABRIL' = 'DOS DE ABRIL',
    '25 DE MAYO' = 'VEINTICINCO DE MAYO',
    '9 DE JULIO' = 'NUEVE DE JULIO',
    'ADOLFO GONZALES CHAVES' = 'GONZALES CHAVES',
    'CORONEL DE MARINA L ROSALES' = 'CORONEL ROSALES',
    'CORONEL DE MARINA LEONARDO ROSALES' = 'CORONEL ROSALES',
    'CONCEPCION DE LA SIERRA' = 'CONCEPCION',
    'FLORENTINO AMEGHINO' = 'AMEGHINO',
    'GENERAL JUAN MADARIAGA' = 'GENERAL MADARIAGA',
    'LDOR GRAL SAN MARTIN' = 'LIBERTADOR GENERAL SAN MARTIN',
    'LEANDRO N ALEM' = 'LEANDRO N. ALEM',
    'GRAL BELGRANO' = 'GENERAL MANUEL BELGRANO',
    'LA CANDELARIA' = 'CANDELARIA',
    'DR MANUEL BELGRANO' = 'CAPITAL',
    'LA CAPITAL' = 'CAPITAL'
  )

#Remove duplicates
df <- df %>% dplyr::select(year, state, county, gender, age_group, nationality, country_of_birth, employment_all)
df <- df %>% group_by(year, state, county, gender, age_group, nationality, country_of_birth) %>% summarise(employment_all = median(employment_all), .groups = "drop")

#Filter age group
working_age <- c('20 a 24','25 a 29','30 a 34','35 a 39','40 a 44','45 a 49','50 a 54','55 a 59','60 a 64')
df <- df %>% filter(age_group %in% working_age)
df_2 <- df %>% group_by(state, county) %>% summarise(employment_all = sum(employment_all), .groups = "drop") %>%
  mutate(employment_all = as.integer(employment_all))

#Merge
df <- read_csv("../../data/landuse/clean/geographicunits_argentina.csv.gz",show_col_types = FALSE)
df_3 <- df  
df <- df_3 %>% inner_join(df_1, by = c("state", "county")) %>% mutate(across(everything(), ~ replace_na(., 0)))
df <- df %>% inner_join(df_2, by = c("state", "county")) %>% mutate(across(everything(), ~ replace_na(., 0)))
write_csv(df, "../../data/factors/clean/employment_argentina.csv.gz")



############################# Brazil 

#### Agricultural employment
df_1 <- NULL
df_2 <- NULL
for (n_p in 0:1) {
  if (n_p == 0) {
    df <- read_excel("../../data/factors/raw/tabela6884_all.xlsx", sheet = 1, .name_repair = "unique_quiet")
  } else {
    df <- read_excel("../../data/factors/raw/tabela6884_family.xlsx", sheet = 1, .name_repair = "unique_quiet")
  }
  
  df <- df[-nrow(df), ]
  colnames(df) <- c("county_id", "county", "producer_type", "activity_type", "employment")
  df <- df[7:nrow(df), ]
  df <- df %>%  mutate( county_id = as.character(county_id), county_id = ifelse(county_id == "" | county_id == "BRnan", NA, county_id) ) %>%
    fill(county_id, .direction = "down") %>%  mutate(county_id = paste0("BR", county_id))
  df <- df %>% mutate(producer_type = as.character(producer_type),producer_type = ifelse(producer_type == "nan", NA, producer_type)) %>%
    fill(producer_type, .direction = "down")
  df <- df %>% mutate(across(everything(), ~ str_replace_all(., c(
    "X" = "0",
    "_" = "0",
    "Total" = "total",
    "Produção de lavouras temporárias" = "seasonal crops",
    "Produção de lavouras permanentes" = "permanent crops",
    "Pecuária e criação de outros animais" = "livestock",
    "Proprietário\\(a\\)" = "owner",
    "Concessionário\\(a\\) ou assentado\\(a\\) aguardando titulação definitiva" = "settled and awaiting final deed",
    "Arrendatário\\(a\\)" = "tenant",
    "Parceiro\\(a\\)" = "partner",
    "Comodatário\\(a\\)" = "lessee",
    "Ocupante" = "occupant",
    "Produtor sem área" = "producer without area"))))
  df <- df %>% mutate(employment = as.numeric(employment),year = 2017, ties = ifelse(n_p == 0, "all", "only_family"))
  df <- df %>% dplyr::select(-county)
  df <- df %>% replace(is.na(.), 0)
  if (n_p == 0) { df_1 <- df  } else { df_2 <- df }
}
df_ag <- bind_rows(df_1, df_2)

#### All employment
df <- read_excel("../../data/factors/raw/tabela1572.xlsx", sheet = 1, .name_repair = "unique_quiet")
df <- df[-nrow(df), ]
colnames(df) <- c("county_id", "county", "type", "employment")
df <- df[5:nrow(df), ]  
df$county_id <- paste0("BR", as.character(df$county_id))
df <- df %>% mutate(across(everything(), ~replace_na(., 0)))
df <- df %>% mutate(employment = as.numeric(employment))
df$year <- 2010
df$producer_type <- "EAP"
df$activity_type <- "EAP"
df$ties <- "NA"
df_all <- df %>% dplyr::select(county_id, producer_type, activity_type, employment, year, ties)
#Merge
df_final <- bind_rows(df_ag, df_all)
write_csv(df_final, "../../data/factors/clean/employment_brazil.csv.gz")

#### Wages
df <- read_excel("../../data/factors/raw/tabela3553.xlsx", sheet = 1, .name_repair = "unique_quiet")
df <- df[-nrow(df), ]
colnames(df) <- c("county_id", "county", "education", "contributor_status", "age_group", "income_urban", "income_rural")
df <- df[-c(1:6), ]
df$county_id <- paste0("BR", df$county_id)
df[is.na(df)] <- 0
df$year <- 2010
df$income_urban <- as.numeric(df$income_urban)
df$income_rural <- as.numeric(df$income_rural)
df_wages <- df %>% dplyr::select(county_id, income_urban, income_rural, year)
write_csv(df_wages, "../../data/factors/clean/wages_brazil.csv.gz")
